home *** CD-ROM | disk | FTP | other *** search
- # Source Generated with Decompyle++
- # File: in.pyc (Python 2.6)
-
- """
- RDFa parser.
-
- RDFa is a set of attributes used to embed RDF in XHTML. An important goal of
- RDFa is to achieve this RDF embedding without repeating existing XHTML content
- when that content is the metadata.
-
- REFERENCES:
-
-     http://www.w3.org/2001/sw/BestPractices/HTML/2005-rdfa-syntax
-
- LICENSE:
-
- BSD
-
- CHANGE HISTORY:
-
- 2006/06/03 - Initial Version
- 2006/06/08 - Added support for role (as per primer not syntax spec)
-              Added support for plaintext and flattening of XMLLiterals
-              ... (Sections 5.1.1.2 and 5.1.2.1)
-              Fixed plaintext bug where it was being resolved as CURIE
-              Added support to skip reserved @rel keywords from:
-                http://www.w3.org/TR/REC-html40/types.html#h-6.12
- 2006/08/12 - Changed reserved @rel resolution to include a '#'
-              Fixed subject resolution for LINK/META when inside HEAD
-              Fixed blank node extraction [_:address] -> [_:_:address]
-              Added support for passing prefix mappings to the Graph
-              via RDFaSink
-              Added @id support as part of subject resolution
-
- Copyright (c) 2006, Elias Torres <elias@torrez.us>
-
- """
- import sys
- import re
- import urllib
- import urlparse
- import cStringIO
- import string
- from xml.dom import pulldom
- from rdflib.syntax.parsers import Parser
- from rdflib.Graph import ConjunctiveGraph
- from rdflib import URIRef
- from rdflib import BNode
- from rdflib import Literal
- from rdflib import Namespace
# Version-control keyword string identifying this revision of the module.
__version__ = '$Id: RDFaParser.py 1072 2007-03-30 18:12:54Z eliast $'

# Attribute names that mark an element as carrying RDFa information.  The
# parser only emits triples for elements that have at least one of these.
rdfa_attribs = [
    'about',
    'property',
    'rel',
    'rev',
    'href',
    'content',
    'role',
    'id',
]

# Link types reserved by HTML 4
# (http://www.w3.org/TR/REC-html40/types.html#h-6.12).  A @rel value that
# matches one of these (case-insensitively) is mapped into the XHTML
# namespace instead of being resolved as a CURIE.
reserved_links = [
    'alternate',
    'stylesheet',
    'start',
    'next',
    'prev',
    'contents',
    'index',
    'glossary',
    'copyright',
    'chapter',
    'section',
    'subsection',
    'appendix',
    'help',
    'bookmark',
]
# Namespaces referenced by the parser:
#   xhtml - prefix for reserved @rel link types
#   xml   - namespace of the xml:base / xml:lang / xml:id attributes
#   rdf   - source of rdf:type and rdf:XMLLiteral
xhtml = Namespace('http://www.w3.org/1999/xhtml')
xml = Namespace('http://www.w3.org/XML/1998/namespace')
rdf = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
-
class RDFaSink(object):
    """Thin adapter between the RDFa parser and a graph object.

    The wrapped ``graph`` must provide ``add((s, p, o))``,
    ``bind(prefix, ns, override=...)`` and ``serialize(format=...)``.
    """

    def __init__(self, graph):
        """Remember the graph that will receive triples and prefixes."""
        self.graph = graph

    def __str__(self):
        """Return the graph serialized with the 'pretty-xml' format."""
        return self.graph.serialize(format='pretty-xml')

    def triple(self, s, p, o):
        """Add the statement (s, p, o) to the graph."""
        self.graph.add((s, p, o))

    def prefix(self, prefix, ns):
        """Bind ``prefix`` to namespace ``ns`` on the graph.

        ``override=False`` is passed so the graph is asked not to replace
        a binding it already holds.
        """
        self.graph.bind(prefix, ns, override=False)
-
-
# Matches a leading "scheme://" plus any surplus slashes that follow it, so
# that "http:////host" can be collapsed to "http://host".  The scheme
# character class follows RFC 3986 (ALPHA / DIGIT / "+" / "-" / "."); the
# hyphen is placed last so it is a literal rather than a "+" to "." range,
# which would also admit ",".
# NOTE(review): an intentionally empty authority such as "file:///path" is
# collapsed too (to "file://path") -- confirm whether callers rely on that.
_urifixer = re.compile(r'^([A-Za-z][A-Za-z0-9+.-]*://)(/*)(.*?)')


def _urljoin(base, uri):
    """Resolve ``uri`` against ``base`` after normalizing extra slashes.

    Any run of additional slashes directly after ``scheme://`` in ``uri``
    is removed before the two are combined with ``urlparse.urljoin``.
    """
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
-
-
class RDFaParser(Parser):
    """RDFa parser built on ``xml.dom.pulldom``.

    ``parse`` walks the pulldom event stream of a document and reports a
    triple to the sink for every RDFa statement found (@property, @rel,
    @rev and @role, with subjects taken from @about / @id).

    NOTE: the scope stacks are created in ``__init__`` and are not reset
    by ``parse``.
    """

    # DOM nodeType codes: 3 is TEXT_NODE, 4 is CDATA_SECTION_NODE.
    _TEXT_NODE_TYPES = (3, 4)

    def __init__(self):
        # Language in effect for the element currently being processed.
        self.lang = None
        # Stack of (subject, node) pairs set by @about / @id.
        self.abouts = []
        # Stack of effective base URIs, one entry per open element.
        self.xmlbases = []
        # Stack of effective languages, one entry per open element.
        self.langs = []
        # Stack of open elements; the leading None is a sentinel so that
        # elementStack[-2] is valid for the root element.
        self.elementStack = [None]
        # Per-tag-name counters used to label generated blank nodes.
        self.bcounter = {}
        # Blank nodes already generated, keyed by DOM node.
        self.bnodes = {}
        self.sink = None

    def parse(self, source, sink, baseURI=None):
        """Parse ``source`` and feed the resulting triples to ``sink``.

        source  -- object providing ``getPublicId()`` and
                   ``getByteStream()``.
        sink    -- graph-like object; it is wrapped in an ``RDFaSink``.
        baseURI -- base URI for resolving references; when false,
                   ``source.getPublicId()`` is used instead.

        After the event stream is exhausted, every namespace mapping held
        by the pulldom handler is passed on to the sink.  The byte stream
        is closed even if parsing raises.
        """
        self.sink = RDFaSink(sink)
        self.triple = self.sink.triple
        self.prefix = self.sink.prefix
        self.baseuri = baseURI or source.getPublicId()
        f = source.getByteStream()
        try:
            events = pulldom.parse(f)
            self.handler = events.pulldom
            for event, node in events:
                if event == pulldom.START_DOCUMENT:
                    # The document itself is the default subject.
                    self.abouts.append((URIRef(''), node))
                elif event == pulldom.END_DOCUMENT:
                    # At most the None sentinel from __init__ may remain.
                    assert len(self.elementStack) <= 1
                elif event == pulldom.START_ELEMENT:
                    self._handleStartElement(events, event, node)
                elif event == pulldom.END_ELEMENT:
                    self._popStacks(event, node)

            # Share the namespace prefix mappings with the sink.
            for nsc in self.handler._ns_contexts:
                for ns, prefix in nsc.items():
                    self.prefix(prefix, ns)
        finally:
            f.close()

    def _handleStartElement(self, events, event, node):
        """Update the scope stacks for ``node`` and emit its triples."""
        self.elementStack.append(node)
        found = [name for name in rdfa_attribs if node.hasAttribute(name)]
        lang = self._pushScope(node)

        # Not an RDFa element: only the scope bookkeeping was needed.
        if not found:
            return

        subject = self._resolveSubject(node, found)

        if 'property' in found:
            self._emitProperty(events, event, node, subject, lang)

        if 'rel' in found:
            rel = node.getAttribute('rel').strip()
            if rel.lower() in reserved_links:
                rel = xhtml['#' + rel.lower()]
            predicate = self.extractCURIEorURI(rel)
            if node.hasAttribute('href'):
                target = self.extractCURIEorURI(node.getAttribute('href'))
                self.triple(subject, predicate, target)

        if 'rev' in found:
            predicate = self.extractCURIEorURI(node.getAttribute('rev'))
            if node.hasAttribute('href'):
                # @rev states the relation in the reverse direction.
                source = self.extractCURIEorURI(node.getAttribute('href'))
                self.triple(source, predicate, subject)

        if 'role' in found:
            # @role is treated as shorthand for rdf:type.
            rdftype = self.extractCURIEorURI(node.getAttribute('role'))
            self.triple(subject, rdf.type, rdftype)

    def _pushScope(self, node):
        """Push xml:base / xml:lang for ``node``; return its language."""
        # Fall back to '' so that a missing base URI cannot reach the
        # regular expression inside _urljoin as None.
        current = self.baseuri or ''
        declared = (node.getAttributeNS(xml, 'base')
                    or node.getAttribute('xml:base'))
        self.baseuri = _urljoin(current, declared or current)
        self.xmlbases.append(self.baseuri)

        if node.hasAttributeNS(xml, 'lang') or node.hasAttribute('xml:lang'):
            lang = (node.getAttributeNS(xml, 'lang')
                    or node.getAttribute('xml:lang'))
            if lang == '':
                # An empty xml:lang is stored as "no language".
                lang = None
        else:
            lang = self.lang
        self.lang = lang
        self.langs.append(lang)
        return lang

    def _resolveSubject(self, node, found):
        """Return the subject for statements made by ``node``.

        @about (or, failing that, @id) on the node pushes a new subject.
        For <meta> / <link> without @about the subject comes from the
        parent element: the document when the parent is <head>, else the
        parent's @about, else its id, else a generated blank node.
        """
        parentNode = self.elementStack[-2]

        if 'about' in found:
            about = self.extractCURIEorURI(node.getAttribute('about'))
            self.abouts.append((about, node))
        elif 'id' in found:
            about = self.extractCURIEorURI('#' + node.getAttribute('id'))
            self.abouts.append((about, node))

        subject = self.abouts[-1][0]

        if node.tagName not in ('meta', 'link'):
            return subject
        if 'about' in found or not parentNode:
            return subject

        if parentNode.tagName == 'head':
            return URIRef('')
        if parentNode.hasAttribute('about'):
            return self.extractCURIEorURI(parentNode.getAttribute('about'))
        if parentNode.hasAttributeNS(xml, 'id') or parentNode.hasAttribute('id'):
            ident = (parentNode.getAttributeNS(xml, 'id')
                     or parentNode.getAttribute('id'))
            return self.extractCURIEorURI('#' + ident)
        return self.generateBlankNode(parentNode)

    def _emitProperty(self, events, event, node, subject, lang):
        """Emit the literal-valued triple described by @property."""
        predicate = self.extractCURIEorURI(node.getAttribute('property'))
        datatype = None
        plaintext = False

        if node.hasAttribute('datatype'):
            sdt = node.getAttribute('datatype')
            if sdt != 'plaintext':
                datatype = self.extractCURIEorURI(sdt)
            else:
                # 'plaintext' is a keyword, not a CURIE to resolve.
                plaintext = True

        if node.hasAttribute('content'):
            literal = Literal(node.getAttribute('content'),
                              lang=lang, datatype=datatype)
        else:
            events.expandNode(node)
            # Expanding consumed this element's END_ELEMENT event, so the
            # stacks have to be popped here instead.
            self._popStacks(event, node)

            if datatype or plaintext:
                parts = [self._getNodeText(child) for child in node.childNodes]
            else:
                parts = [child.toxml() for child in node.childNodes]
            content = ''.join(parts).strip()
            literal = Literal(content, datatype=datatype or rdf.XMLLiteral)

        # An empty literal is falsy, so no triple is emitted for it.
        if literal:
            self.triple(subject, predicate, literal)

    def _getNodeText(self, node):
        """Return the text of ``node``.

        For a text or CDATA node this is its value; for any other node it
        is the concatenation of its direct text / CDATA children only.
        """
        if node.nodeType in self._TEXT_NODE_TYPES:
            return node.nodeValue
        return ''.join([child.nodeValue for child in node.childNodes
                        if child.nodeType in self._TEXT_NODE_TYPES])

    def generateBlankNode(self, parentNode):
        """Return a blank node for ``parentNode``, creating it if needed.

        The same DOM node always yields the same blank node.  New ones are
        labelled with the tag name followed by a per-tag counter that
        starts at 0.
        """
        name = parentNode.tagName
        if parentNode in self.bnodes:
            return self.bnodes[parentNode]

        self.bcounter[name] = self.bcounter.get(name, -1) + 1
        bnode = BNode('%s%d' % (name, self.bcounter[name]))
        self.bnodes[parentNode] = bnode
        return bnode

    def extractCURIEorURI(self, resource):
        """Turn an attribute value into a ``BNode`` or ``URIRef``.

        Surrounding square brackets are stripped, a prefix known to the
        pulldom handler's namespace contexts is expanded, a value starting
        with ``_:`` becomes a blank node, and anything else is resolved
        against the current base URI.
        """
        if resource.startswith('[') and resource.endswith(']'):
            resource = resource[1:-1]

        if ':' in resource:
            rpre, rsuf = resource.split(':', 1)
            # Every context is scanned; a later match replaces an earlier one.
            for nsc in self.handler._ns_contexts:
                for ns, prefix in nsc.items():
                    if prefix == rpre:
                        resource = ns + rsuf

        if resource[0:2] == '_:':
            return BNode(resource[2:])
        return URIRef(self.resolveURI(resource))

    def resolveURI(self, uri):
        """Resolve ``uri`` against the current base URI ('' if unset)."""
        return _urljoin(self.baseuri or '', uri)

    def _popStacks(self, event, node):
        """Leave the scope of ``node``: pop subject, element, base, lang.

        ``event`` is accepted for symmetry with the caller and is unused.
        """
        if self.abouts:
            about, aboutnode = self.abouts[-1]
            if aboutnode == node:
                self.abouts.pop()

        self.elementStack.pop()

        if self.xmlbases:
            self.xmlbases.pop()
            if self.xmlbases and self.xmlbases[-1]:
                self.baseuri = self.xmlbases[-1]

        if self.langs:
            self.langs.pop()
            # Restore the enclosing language even when it is None;
            # otherwise an xml:lang would leak onto following siblings.
            if self.langs:
                self.lang = self.langs[-1]
            else:
                self.lang = None
-
if __name__ == '__main__':
    # Command-line use: load the document named by the first argument with
    # the 'rdfa' format and print the graph serialized as 'pretty-xml'.
    store = ConjunctiveGraph()
    store.load(sys.argv[1], format='rdfa')
    print(store.serialize(format='pretty-xml'))
-
-